
import time
import requests
from bs4 import BeautifulSoup
import pymysql

class CS(object):
    """Scrape house-sale listings from cs.anjuke.com into MySQL.

    ``get_data`` downloads each listing page, extracts the fields of every
    ``.property`` element and inserts one row per listing into the ``house``
    table of the local ``house`` database.
    """

    # Seconds to wait on a page request before giving up; without a timeout
    # ``requests.get`` can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    # Seconds to pause after every page request.
    PAGE_DELAY = 3

    # Listing pages to fetch: range(1, 34) yields pages 1 through 33.
    # NOTE(review): the original comment said "pages 1 to 34"; confirm whether
    # page 34 was meant to be included.
    PAGES = range(1, 34)

    # Selector shared by the layout / area / orientation / floor / build-date
    # cells of a listing (in that positional order).
    INFO_SELECTOR = '.property-content-info-text'

    # Columns written to the database, in the order of the SQL placeholders.
    # 'tag' and 'url' are parsed and printed but are not part of the insert.
    INSERT_FIELDS = ('title', 'house_type', 'area', 'award', 'floor', 'jz_date',
                     'total_price', 'unit_price', 'xq_name', 'address')
    INSERT_SQL = "INSERT INTO house(title, house_type, area, award, floor, jz_date, total_price, unit_price, xq_name, address) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"

    def __init__(self):
        """Set up the HTTP headers sent with every page request."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'cookie': ''
        }

    @staticmethod
    def _clean(text):
        """Return *text* with newlines removed and outer whitespace stripped."""
        return text.replace('\n', '').strip()

    @classmethod
    def _first_text(cls, item, selector):
        """Return the cleaned text of the first node matching *selector*.

        Raises:
            IndexError: if nothing inside *item* matches *selector*.
        """
        return cls._clean(item.select(selector)[0].text)

    @classmethod
    def _parse_item(cls, item):
        """Extract the fields of one listing element into a dict.

        Args:
            item: an element exposing ``select(css_selector)``, whose matches
                expose ``.text`` (and ``['href']`` for the ``a`` match).

        Returns:
            dict with the keys title, house_type, area, award, floor,
            jz_date, total_price, unit_price, xq_name, address, tag, url.
            ``floor`` and ``jz_date`` are '' when the listing has fewer info
            cells than expected.

        Raises:
            IndexError: if any other field is missing from the listing.
        """
        # Select the positional info cells once instead of once per field.
        info = [cls._clean(node.text) for node in item.select(cls.INFO_SELECTOR)]

        data = {}
        data['title'] = cls._first_text(item, 'h3')  # title
        data['house_type'] = info[0]  # layout
        data['area'] = info[1]  # area
        data['award'] = info[2]  # orientation
        # Floor and build date are optional: fall back to an empty string.
        data['floor'] = info[3] if len(info) > 3 else ''  # floor
        data['jz_date'] = info[4] if len(info) > 4 else ''  # build date
        data['total_price'] = cls._first_text(item, '.property-price-total')  # total price
        data['unit_price'] = cls._first_text(item, '.property-price-average')  # unit price
        data['xq_name'] = cls._first_text(item, '.property-content-info-comm-name')  # community name
        data['address'] = cls._first_text(item, '.property-content-info-comm-address')  # address
        # Every tag is followed by a comma, so the result keeps a trailing comma.
        data['tag'] = ''.join(t.text + ',' for t in item.select('.property-content-info-tag'))  # tags
        data['url'] = item.select('a')[0]['href']
        return data

    def get_data(self):
        """Fetch every listing page and insert its listings into MySQL.

        Prints each page URL and each parsed listing. Each row is committed
        right after it is inserted, so rows stored before a failure are kept.
        The cursor and connection are closed even when an error interrupts
        the scrape.
        """
        # Connect to the MySQL database.
        conn = pymysql.connect(host='localhost', user='root', password='root', database='house')
        try:
            # The cursor is closed automatically when the with-block exits.
            with conn.cursor() as cursor:
                for p in self.PAGES:
                    url = 'https://cs.anjuke.com/sale/p' + str(p) + '/?from=esf_list'
                    print('url: ', url)

                    response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                    time.sleep(self.PAGE_DELAY)
                    soup = BeautifulSoup(response.text, 'lxml')
                    for item in soup.select('.property'):
                        data = self._parse_item(item)
                        print(data)
                        # Store the listing in MySQL.
                        cursor.execute(self.INSERT_SQL, tuple(data[field] for field in self.INSERT_FIELDS))
                        # Commit the transaction.
                        conn.commit()
        finally:
            conn.close()

    def run(self):
        """Run the scraper."""
        self.get_data()


if __name__ == '__main__':
    # Script entry point: build the scraper and start it.
    scraper = CS()
    scraper.run()



